library(MASS)
library(haven)
library(Matrix)
library(lfe)
library(ggplot2)
library(RColorBrewer)
library(Hotelling)
library(did)

source("functions.R")

### run data_cleaning.R first
load("data_cleaned.RData")

### K-means clustering using pretreatment time series
KmeansParameter <- list(spline=function(t){
  ans <- 1
  return(ans)
}
)  
KmeansParameter$eps <- 0
KmeansParameter$maxiter <- 200
KmeansParameter$n_of_initial_values <- 5000

# Use pretreatment outcomes only
SubData <- KmeansData[KmeansData$time<=DataCleaning$T0 & KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
DataCleaning$N0 <- max(SubData$id)
SubData <- as.matrix(SubData)

# Choice of K
kmeansK2 <- kmeans_fun(KmeansParameter$n_of_initial_values,SubData,2,
                       KmeansParameter$eps,KmeansParameter$maxiter,
                       KmeansParameter$spline)
kmeansK3 <- kmeans_fun(KmeansParameter$n_of_initial_values,SubData,3,
                       KmeansParameter$eps,KmeansParameter$maxiter,
                       KmeansParameter$spline)
kmeansK4 <- kmeans_fun(KmeansParameter$n_of_initial_values,SubData,4,
                       KmeansParameter$eps,KmeansParameter$maxiter,
                       KmeansParameter$spline)
KmeansParameter$BIC <- 
  c(kmeansK2$MSE[length(kmeansK2$MSE)] +
      (2*length(KmeansParameter$spline(1))+DataCleaning$N0+5)*
      log(DataCleaning$N0*DataCleaning$T0)/(DataCleaning$N0*DataCleaning$T0)*
      kmeansK4$MSE[length(kmeansK4$MSE)],
    kmeansK3$MSE[length(kmeansK3$MSE)] +
      (3*length(KmeansParameter$spline(1))+DataCleaning$N0+5)*
      log(DataCleaning$N0*DataCleaning$T0)/(DataCleaning$N0*DataCleaning$T0)*
      kmeansK4$MSE[length(kmeansK4$MSE)],
    kmeansK4$MSE[length(kmeansK4$MSE)] +
      (4*length(KmeansParameter$spline(1))+DataCleaning$N0+5)*
      log(DataCleaning$N0*DataCleaning$T0)/(DataCleaning$N0*DataCleaning$T0)*
      kmeansK4$MSE[length(kmeansK4$MSE)])

kmeansK2 <- extrapolate_gamma(T,kmeansK2,SubData,KmeansParameter$spline)
kmeansK3 <- extrapolate_gamma(T,kmeansK3,SubData,KmeansParameter$spline)
kmeansK4 <- extrapolate_gamma(T,kmeansK4,SubData,KmeansParameter$spline)

kmeansK2$typeTE <- retrieve_delta(T,kmeansK2$delta,KmeansParameter$spline)
kmeansK3$typeTE <- retrieve_delta(T,kmeansK3$delta,KmeansParameter$spline)
kmeansK4$typeTE <- retrieve_delta(T,kmeansK4$delta,KmeansParameter$spline)

kmeansK2$summary
kmeansK3$summary
kmeansK4$summary

kmeansK2$delta
kmeansK3$delta
kmeansK4$delta

# Table 3 of SA
KmeansParameter$BIC

# Table 4 of SA
table(cbind(kmeansK2$gamma*100+kmeansK3$gamma*10+kmeansK4$gamma))

# Table 2 of SA
temp <- LutzData[LutzData$time==0,]
temp$type <- kmeansK2$gamma[temp$id]
temp <- temp[,c(30,11,42,505)]
print(temp[temp$type==1,],n=N)
print(temp[temp$type==2,],n=N)
remove(temp)

temp <- LutzData[LutzData$time==0,]
temp$type <- kmeansK3$gamma[temp$id]
temp <- temp[,c(30,11,42,505)]
print(temp[temp$type==1,],n=N)
print(temp[temp$type==2,],n=N)
print(temp[temp$type==3,],n=N)
remove(temp)

temp <- LutzData[LutzData$time==0,]
temp$type <- kmeansK4$gamma[temp$id]
temp <- temp[,c(30,11,42,505)]
print(temp[temp$type==1,],n=N)
print(temp[temp$type==2,],n=N)
print(temp[temp$type==3,],n=N)
print(temp[temp$type==4,],n=N)
remove(temp)


### K-means clustering using only never-treated units
# Type classification by extrapolation
CrossValidationNT <- list(spline= function(t){
  ans <- c(1,(t>7),(t>13))
  return(ans)
}
)

SubData <- KmeansData[KmeansData$E>T & KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
temp <- unique(SubData$id)
SubData$id <- sapply(SubData$id,function(x) which(x==temp))
remove(temp)
SubData <- as.matrix(SubData)
kmeansNT <- kmeans_fun(KmeansParameter$n_of_initial_values,SubData,2,
                       KmeansParameter$eps,KmeansParameter$maxiter,
                       CrossValidationNT$spline)
table(c((kmeansK2$gamma)[KmeansData$E[KmeansData$time==1]>T]*10+kmeansNT$gamma))

SubData <- KmeansData[KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
SubData <- as.matrix(SubData)
kmeansNT <- extrapolate_gamma(T,kmeansNT,SubData,CrossValidationNT$spline)
kmeansNT$typeTE <- retrieve_delta(T,kmeansNT$delta,CrossValidationNT$spline)

# Table 12 of SA
table(c(kmeansK2$gamma*10+kmeansNT$gamma))
table(c(kmeansK2$gamma*10+kmeansNT$gamma)[SubData[SubData[,3]==1,4]>T])

save.image("data_classified.RData")